/*******************************************************************************

This code file cleans the cumulative assessment roll.

*******************************************************************************/

*** Manage settings

	run "~/Dropbox (MIT)/Research/NYC421a/code/modules/settings.do"
	
*** Import data
	
	import delimited "$data/raw/assessments/Property_Valuation_and_Assessment_Data.csv", encoding(ISO-8859-1)
	
*** Clean data to minimal file before merge

	* Keep needed variables
	keep bbl year avtot avland fullval extot exland

	* Destring BBL code
	rename bble bbl
	destring bbl, replace force
	format bbl %18.0f
	
	* Destring years
	split year, parse("/")
	drop year year2
	destring year1, replace force
	rename year year
	
	order bbl year fullval avtot avland
	
	* Drop if missing BBL code or DOF estimated market value is zero
	drop if missing(bbl) | fullval == 0

	* Save to tempfile
	tempfile val_tmp
	save `val_tmp', replace
	
*** Match w/ main dataset

	* First, match directly into the merged PLUTO dataset
	merge m:1 bbl using "$data/clean/pluto_merged.dta", nogen keep(3)
	tempfile val_tmp_matched
	save `val_tmp_matched', replace
	
	* Then go back and fix unmatched data (condos)
	use `val_tmp', clear
	merge m:1 bbl using "$data/clean/pluto_merged.dta", nogen keep(1)
	mmerge bbl using "$data/clean/xwalk_condo_bbl.dta"
	keep if _m == 3
	drop _m
	
	replace bbl = bbl_condo if !missing(bbl_condo)
	keep if !missing(bbl_condo)
	keep bbl year fullval avtot avland exland extot
	
	collapse (sum) fullval avtot avland exland extot, by(bbl year)
		
	merge m:1 bbl using "$data/clean/pluto_merged.dta", nogen keep(3)
	
	append using `val_tmp_matched'
	
*** Model underassessment

	gen fraction = avtot/fullval
	gen logodds_fraction = ln(fraction / (1-fraction))
	
	xtset bbl year
	gen dfraction = d.fraction
	bys bbl: egen hasjump = max((dfraction > 0.2)*!missing(dfraction))
	bys bbl: egen year_jump = max(year*(dfraction > 0.2)*!missing(dfraction)) if hasjump == 1
	bys bbl: gen post_jump = (year >= year_jump) | hasjump == 0
	xtset, clear
	
	drop if post_jump == 0
	
	bys bbl: egen unitstotal_ = mode(unitstotal)
	replace unitstotal = unitstotal_ if missing(unitstotal)
	drop unitstotal_
	
	gen identifier = .
	replace identifier = 1 if unitstot == 3
	replace identifier = 2 if unitstot > 3 & unitstot < 11
	replace identifier = 3 if unitstot >= 11 & !missing(unitstot)
	
	gen age = year - max(yearbuilt,year_jump)
	replace age = year - (yearpermit + 3) if age > 15
	drop if age < 0
	
	reghdfe logodds_fraction if post_jump == 1, a(bbl c.age#i.bbl i.identifier#i.age, savefe) resid(resid)
	local constant = _b[_cons]
	
	gen logodds_fraction_pred = logodds_fraction - resid
	drop resid
	
	drop if missing(age)
	
	drop year_jump hasjump dfraction
	
*** Create panel of BBLs

	preserve

	keep bbl
	duplicates drop
	expand 26
	
	gen age = 0
	bys bbl: replace age = age[_n-1]+1 if !missing(age[_n-1])
	
	tempfile tmp_panel
	save `tmp_panel', replace
	
	restore
	
	merge m:1 bbl age using `tmp_panel', nogen
	
	bys bbl: egen identifier_ = mean(identifier)
	drop identifier
	rename identifier_ identifier
	
*** Extrapolation for t > 15

	gen age_effect = .

	forvalues i = 1/3 {
		
		tabstat __hdfe3__ if identifier == `i', by(age) stat(mean) save
		
		mat coef10 = r(Stat11)
		local coef10 = coef10[1,1]
		
		mat coef15 = r(Stat16)
		local coef15 = coef15[1,1]
		
		gen age_effect_ = __hdfe3__
		replace age_effect_ = `coef15' + ((`coef15' - `coef10')/5) * (age - 15) if identifier == `i' & age >= 15 & !missing(age) 
		
		bys age identifier: egen tmp = mean(age_effect_)
		replace age_effect = tmp if !missing(tmp) & missing(age_effect)
		drop age_effect_ tmp
	}
	
	bys bbl: egen intercept = mean(__hdfe1__)
	bys bbl: egen slope = mean(__hdfe2__Slope1)
	
	bys nta identifier: egen slope_m = mean(__hdfe2__Slope1)
	bys bbl: egen final_age = max(age*!missing(fraction))
	bys bbl: egen fraction_at_final_age = max(fraction*(age==final_age))
	gen logodds_frac_at_final_age = ln(fraction_at_final_age/(1-fraction_at_final_age))
	
	gen constant = `constant'
	
	drop __hdfe1__ __hdfe2__Slope1 __hdfe3__
	
	replace logodds_fraction_pred = constant + intercept + age_effect + slope*age if missing(logodds_fraction_pred)
	replace logodds_fraction_pred = constant + logodds_frac_at_final_age + age_effect + slope_m*age if missing(logodds_fraction_pred)
	gen fraction_pred = exp(logodds_fraction_pred)/(1+exp(logodds_fraction_pred))
	
	replace fraction_pred = min(fraction_pred, 0.06) if identifier == 1 & !missing(fraction_pred)
	replace fraction_pred = min(fraction_pred, 0.45) if (identifier == 2 | identifier == 3) & !missing(fraction_pred)
	
	bys bbl: egen minyear = min(year)
	bys bbl: egen age_at_minyear = max(age*(year==minyear))
	replace year = minyear + (age - age_at_minyear) if missing(year)
	
	drop minyear age_at_minyear slope_m fraction_at_final_age
	
*** Save panel
	
	* File 1: 
	
	preserve
	
	keep bbl year age fraction_pred identifier
	
	save "$data/raw/assessments/av_fraction_history.dta", replace
	
	restore
	
	* File 2: 
	
	preserve
	
	keep bbl year age fraction_pred identifier
	
	drop year
	duplicates drop bbl id age, force
	reshape wide fraction, i(bbl id) j(age)
	
	drop identifier
	
	save "$data/raw/assessments/av_fraction_history_wide.dta", replace
	
	restore
	
	* File 3:
	
	preserve
	
	keep bbl year fraction_pred fraction
	
	use "$data/raw/assessments/av_fraction_history.dta", clear
	
	keep if year == 2018
	
	duplicates drop bbl, force
	
	gen sh_taxable = fraction
	replace sh_taxable = fraction_pred if missing(sh_taxable)

	save "$data/raw/assessments/av_fraction_mostrecent.dta", replace
	
	restore
	
